fuente: https://github.com/DiegoKoz/MIA_text_mining
Warning: Este dataset es muy pesado. Eso implica que lleva tiempo correr los modelos y que puede no entrar en la memoria de la computadora. Para la clase, se puede hacer un muestreo de textos para que no pese tanto
library(tidyverse)
[30m── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.2.0 [32m✔[30m [34mpurrr [30m 0.3.2
[32m✔[30m [34mtibble [30m 2.1.3 [32m✔[30m [34mdplyr [30m 0.8.3
[32m✔[30m [34mtidyr [30m 0.8.3 [32m✔[30m [34mstringr[30m 1.4.0
[32m✔[30m [34mreadr [30m 1.3.1 [32m✔[30m [34mforcats[30m 0.4.0[39m
[30m── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mpurrr[30m::[32mis_null()[30m masks [34mtestthat[30m::is_null()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31m✖[30m [34mdplyr[30m::[32mmatches()[30m masks [34mtestthat[30m::matches()[39m
library(glue)
Attaching package: ‘glue’
The following object is masked from ‘package:dplyr’:
collapse
library(tm)
Loading required package: NLP
Attaching package: ‘NLP’
The following object is masked from ‘package:ggplot2’:
annotate
library(topicmodels)
library(tidytext)
library(stringi)
library(LDAvis)
library(slam)
library(tsne)
library(lubridate)
Attaching package: ‘lubridate’
The following object is masked from ‘package:base’:
date
library(DT)
library(lsa)
Loading required package: SnowballC
library(igraph)
Attaching package: ‘igraph’
The following objects are masked from ‘package:lubridate’:
%--%, union
The following objects are masked from ‘package:dplyr’:
as_data_frame, groups, union
The following objects are masked from ‘package:purrr’:
compose, simplify
The following object is masked from ‘package:tidyr’:
crossing
The following object is masked from ‘package:tibble’:
as_data_frame
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:testthat’:
compare
The following object is masked from ‘package:base’:
union
library(ggraph)
library(tidygraph)
Attaching package: ‘tidygraph’
The following object is masked from ‘package:igraph’:
groups
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:testthat’:
matches
# Load the pre-cleaned corpus: one row per document with metadata
# (tipo, autor, titulo, links, fecha, id) plus the full text column.
df <- read_csv('../data/txt/texto_limpio.txt')
Parsed with column specification:
cols(
tipo = [31mcol_character()[39m,
autor = [31mcol_character()[39m,
titulo = [31mcol_character()[39m,
texto = [31mcol_character()[39m,
link = [31mcol_character()[39m,
link1 = [31mcol_character()[39m,
fecha = [32mcol_double()[39m,
id = [31mcol_character()[39m
)
# df <- read_rds('data/MIA.RDS')
# Keep only the articles ("notas") and normalise the raw text:
# lower-case, strip accents, trim, turn control characters and
# punctuation into spaces, drop digits, and squeeze repeated
# whitespace into single spaces.
df <- df %>%
  filter(tipo == 'notas') %>%
  mutate(
    texto = tolower(texto),
    texto = stri_trans_general(texto, "Latin-ASCII"),
    texto = str_trim(texto, side = 'both'),
    # tab / newline / carriage return handled in one pass
    texto = str_replace_all(texto, '[\\t\\n\\r]', ' '),
    texto = str_replace_all(texto, '[[:punct:]]', ' '),
    texto = str_remove_all(texto, '\\d'),
    texto = str_replace_all(texto, "\\s+", " ")
  )
Para topic modeling, las palabras comunes de la lengua generan mucho ruido y terminan predominando en los tópicos.
Vamos a eliminar no solo las Stop Words, sino también las palabras más utilizadas en el español que no están relacionadas con nuestra temática. Para eso, tenemos un archivo r_words.txt donde pusimos todas las palabras más comunes.
Además, aprovechamos para eliminar los tokens que quedaron del scrapeo que en realidad son parte del código html (ver final del archivo).
¿de donde salieron estos tokens? En una primera iteración del LDA, uno de los tópicos que se armó era de código html.
# Frequent Spanish words (plus HTML-scrape leftovers, see file end) to
# remove before LDA. The file has no header row, so readr names the
# single column X1. Use FALSE, never F: F is an ordinary binding that
# can be reassigned.
palabras_comunes <- read_csv(file = 'data/r_words.txt', col_names = FALSE)
Parsed with column specification:
cols(
X1 = [31mcol_character()[39m
)
# Apply the same accent-stripping transliteration used on the corpus so
# the removal lists match the normalised tokens exactly.
palabras_comunes <- stri_trans_general(palabras_comunes$X1, "Latin-ASCII")
stop_words <- stri_trans_general(stopwords(kind = "es"), "Latin-ASCII")
# Merged, de-duplicated removal list (Spanish stop words + common words).
palabras_eliminar <- unique(c(stop_words, palabras_comunes))
# Free the intermediates; this dataset barely fits in memory.
rm(stop_words, palabras_comunes)
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2213246 118.3 4018710 214.7 4018710 214.7
Vcells 12067907 92.1 40243535 307.1 33467399 255.4
# Build a volatile in-memory corpus, drop the removal list, then create
# the document-term matrix fed to the LDA below.
# Fixes: `<-` instead of `=` for assignment, and a lower-case variable
# name so we do not shadow the tm::Corpus() constructor.
corpus <- VCorpus(VectorSource(df$texto))
corpus <- tm_map(corpus, removeWords, palabras_eliminar)
# Stemming was tried and discarded:
# corpus <- tm_map(corpus, stemDocument, language = "spanish")
dtm <- DocumentTermMatrix(corpus)
# The corpus is large; release it as soon as the DTM exists.
rm(corpus)
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2348239 125.5 4018710 214.7 4018710 214.7
Vcells 16995397 129.7 40243535 307.1 39272344 299.7
# tm::nTerms(dtm)
# Optional: drop documents left empty after stop-word removal.
# rowTotals <- rowSums(as.matrix(dtm))
# nDocs(dtm)
# dtm <- dtm[rowTotals> 0, ]
# nDocs(dtm)
# Cache the DTM so the expensive preprocessing need not be re-run.
write_rds(dtm, 'data/dtm_MIA.rds')
# df <- df[which(rowTotals>0),] #%>% # would drop the document that was empty
dtm <- read_rds('data/dtm_MIA.rds')
Limpio la memoria porque ya no me queda espacio.
# The removal list is no longer needed; reclaim memory before fitting.
rm(palabras_eliminar)
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2348089 125.5 4018710 214.7 4018710 214.7
Vcells 16995237 129.7 40243535 307.1 39272344 299.7
# Fit a 20-topic LDA via Gibbs sampling. `delta` is the prior for the
# per-topic word distribution; the fixed seed makes the sampler
# reproducible.
lda_fit <- LDA(dtm, k = 20,method = "Gibbs", control = list(delta=0.6,seed = 1234))
lda_fit
A LDA_Gibbs topic model with 20 topics.
saveRDS(lda_fit, 'modelos/MIA_lda20.rds') # Takes long to run, so we persist the results
lda_fit <- read_rds('modelos/MIA_lda20.rds')
# Top 10 terms per topic, used below to hand-label the topics.
Terms <- terms(lda_fit, 10)
Terms
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 Topic 10
[1,] "tratamiento" "lenin" "mitin" "hombre" "gobierno" "gramsci" "escritos" "stalin" "anapo" "convocado"
[2,] "aplicables" "carta" "satelite" "ser" "pueblo" "cuadernos" "oposicion" "trotsky" "colombiana" "judiciales"
[3,] "link" "tomo" "descargar" "hombres" "chile" "ordine" "internacional" "moscu" "lleras" "meses"
[4,] "nestor" "libro" "formato" "sociedad" "pais" "nuovo" "trotsky" "gpu" "mrl" "und"
[5,] "paradojas" "editorial" "link" "vida" "trabajadores" "antonio" "partido" "comite" "caudillismo" "vino"
[6,] "significado" "revista" "parisino" "mujeres" "popular" "ever" "stalin" "oposicion" "anapistas" "absuelto"
[7,] "acosado" "cartas" "reconquista" "sino" "presidente" "aga" "izquierda" "central" "anapista" "aclamada"
[8,] "agradaria" "ulianova" "abandonar" "mujer" "companeros" "distorsion" "carta" "lenin" "valencia" "actualidad"
[9,] "aprension" "obras" "acepta" "social" "nacional" "pci" "sovietica" "anos" "bipartidismo" "afirmais"
[10,] "blind" "primera" "aplicara" "mundo" "ser" "carcel" "comintern" "juicio" "pinilla" "antiguedad"
Topic 11 Topic 12 Topic 13 Topic 14 Topic 15 Topic 16 Topic 17 Topic 18 Topic 19
[1,] "partido" "bolivia" "islamistas" "eisermann" "trabajo" "espana" "china" "lula" "indice"
[2,] "revolucion" "casa" "islam" "enviadas" "produccion" "espanola" "mao" "formulaba" "organo"
[3,] "lucha" "mariategui" "islamismo" "humor" "economia" "poum" "chino" "introducido" "pitt"
[4,] "politica" "carlos" "musulmanes" "pertenencias" "capitalista" "cataluna" "tse" "plato" "reuniones"
[5,] "clase" "hombres" "jomeini" "pottier" "industria" "barcelona" "tung" "abandonaran" "aplastando"
[6,] "guerra" "lora" "mezquitas" "reciba" "capital" "espanol" "albania" "agencias" "conservado"
[7,] "proletariado" "noche" "arabia" "rompiendo" "sistema" "madrid" "rda" "alejados" "dada"
[8,] "masas" "villa" "fundamentalismo" "www" "campesinos" "mfa" "chinos" "american" "dobles"
[9,] "poder" "oro" "fis" "abarca" "propiedad" "nin" "kuomintang" "aparentemente" "especializado"
[10,] "obreros" "nota" "islamica" "aisladas" "economica" "cnt" "tsetung" "apoyaremos" "gordos"
Topic 20
[1,] "ejercito"
[2,] "frente"
[3,] "rojo"
[4,] "militares"
[5,] "guerra"
[6,] "militar"
[7,] "sovietico"
[8,] "rusia"
[9,] "campesinos"
[10,] "pueblo"
# Hand-assigned labels for the 20 topics, based on the top terms above
# ('?' marks a topic the authors could not interpret).
diccionario <- tibble(
n_topico = 1:20,
nombre_topico = c('perestroika','formato','interna_bolche','pcf','expulsados','orwell','?','althusser','turcos','bolivia','longuet','ejercito','chile','españa','indoamericana','lenin','argentina','china','trabajo_prod','partido')
)
Visualizacion
# Build the LDAvis JSON payload from a fitted topicmodels object.
#
# @param fitted  A fitted LDA model (topicmodels::LDA).
# @param dtm     The DocumentTermMatrix the model was fitted on.
# @return        JSON string suitable for LDAvis::serVis().
topicmodels_json_ldavis <- function(fitted, dtm){
  # 2-D projection for the topic map: t-SNE on the left singular
  # vectors, replacing LDAvis's default PCA-based MDS.
  svd_tsne <- function(x) tsne(svd(x)$u)
  # Posterior word distribution per topic (phi) and topic distribution
  # per document (theta).
  phi <- as.matrix(posterior(fitted)$terms)
  theta <- as.matrix(posterior(fitted)$topics)
  vocab <- colnames(phi)
  term_freq <- slam::col_sums(dtm)
  # BUG FIX: doc.length must be each document's total token count.
  # The original `as.vector(table(dtm$i))` counted *distinct* terms per
  # document and silently dropped empty documents, misaligning the
  # vector with theta's rows. slam::row_sums sums the sparse rows
  # without densifying the matrix.
  doc_length <- slam::row_sums(dtm)
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                                 vocab = vocab,
                                 mds.method = svd_tsne,
                                 plot.opts = list(xlab = "tsne", ylab = ""),
                                 doc.length = doc_length,
                                 term.frequency = term_freq)
  json_lda
}
# Assemble the LDAvis JSON (runs t-SNE internally — slow; see epochs below).
json_res <- topicmodels_json_ldavis(lda_fit, dtm)
sigma summary: Min. : 33554432 |1st Qu. : 33554432 |Median : 33554432 |Mean : 33554432 |3rd Qu. : 33554432 |Max. : 33554432 |
Epoch: Iteration #100 error is: 9.2235707542836
Epoch: Iteration #200 error is: 0.575966219423543
Epoch: Iteration #300 error is: 0.425676483504934
Epoch: Iteration #400 error is: 0.415121366960652
Epoch: Iteration #500 error is: 0.412188213579679
Epoch: Iteration #600 error is: 0.411756026593054
Epoch: Iteration #700 error is: 0.411342066613491
Epoch: Iteration #800 error is: 0.410683708963011
Epoch: Iteration #900 error is: 0.409861178035016
Epoch: Iteration #1000 error is: 0.409529777397322
# Publish the interactive LDAvis panel as a GitHub gist (needs the
# gistr package and credentials) and open it in the browser.
# TRUE/FALSE spelled out — T/F are reassignable bindings.
serVis(json_res, as.gist = TRUE, open.browser = TRUE)
Loading required namespace: gistr
# Per-document topic distributions joined to author/date metadata
# (posterior()$topics rows align with df rows, same document order).
dist_topicos <- df %>%
  select(autor, fecha) %>%
  bind_cols(as_tibble(as.matrix(posterior(lda_fit)$topics)))
# Average per author: one 20-dimensional topic profile per author,
# with the author's mean date rounded to a year.
# (na.rm = TRUE spelled out — T is a reassignable binding.)
dist_topicos <- dist_topicos %>%
  group_by(autor) %>%
  summarise_all(~mean(.x, na.rm = TRUE)) %>%
  mutate(fecha = round(fecha))
# names(dist_topicos)[3:22] <- diccionario$nombre_topico
# Interactive table of per-author topic shares. Columns 3:22 hold the
# 20 topic proportions, rendered as percentages with in-cell colour
# bars; the Buttons extension adds Excel/CSV/copy/PDF export.
dist_topicos %>%
filter(!is.na(fecha)) %>%
datatable(., filter = 'top',extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('excel', "csv", "copy", "pdf"), pageLength = 20, autoWidth = TRUE),rownames= FALSE) %>%
# formatPercentage(diccionario$nombre_topico, 2) %>%
# formatStyle(diccionario$nombre_topico, background = styleColorBar(c(0,1), 'deepskyblue')) %>%
# formatStyle(diccionario$nombre_topico,
formatPercentage(3:22, 2) %>%
formatStyle(3:22, background = styleColorBar(c(0,1), 'deepskyblue')) %>%
formatStyle(3:22,
backgroundSize = '98% 60%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center')
NA
# Embed the 20-dimensional per-author topic profiles into 2-D with t-SNE.
topicos_tsne <- tsne(dist_topicos[3:22],k = 2)
sigma summary: Min. : 0.425038112521257 |1st Qu. : 0.562747476200782 |Median : 0.637153108892315 |Mean : 0.668023052516373 |3rd Qu. : 0.745184867412922 |Max. : 1.34741741045709 |
Epoch: Iteration #100 error is: 14.9987118821728
Epoch: Iteration #200 error is: 0.640286676321664
Epoch: Iteration #300 error is: 0.616030833552023
Epoch: Iteration #400 error is: 0.601395297482975
Epoch: Iteration #500 error is: 0.601375063151687
Epoch: Iteration #600 error is: 0.601374641536464
Epoch: Iteration #700 error is: 0.601374635592858
Epoch: Iteration #800 error is: 0.601374635212809
Epoch: Iteration #900 error is: 0.60137463520195
Epoch: Iteration #1000 error is: 0.601374635201821
# Name the two t-SNE output columns tsne_proj_1 / tsne_proj_2.
topicos_tsne <- as_tibble(topicos_tsne,.name_repair = ~glue('tsne_proj_{c(1,2)}'))
# Authors placed in t-SNE space, coloured by (rounded) date.
grafico <- dist_topicos %>% bind_cols(topicos_tsne) %>%
ggplot(aes(tsne_proj_1,tsne_proj_2, label=autor, color=fecha)) +
geom_text()+
theme_minimal()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1850)
plotly::ggplotly(grafico)
# Same t-SNE map restricted to authors dated after 1800, with the
# colour-gradient midpoint shifted to 1940 accordingly.
dist_topicos %>% bind_cols(topicos_tsne) %>%
filter(!is.na(fecha), fecha>1800) %>%
ggplot(aes(tsne_proj_1,tsne_proj_2, label=autor, color=fecha)) +
geom_text()+
theme_minimal()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1940)
# PCA alternative to t-SNE: plot authors on the first two principal
# components of the topic profiles (cor = TRUE uses the correlation matrix).
topicos_pca <- princomp(dist_topicos[3:22], cor = TRUE)
dist_topicos %>% bind_cols(as_tibble(topicos_pca$scores[,1:2])) %>%
ggplot(aes(Comp.1,Comp.2, label=autor, color = fecha)) +
geom_text()+
theme_minimal()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1850)
NA
NA
# Same PCA map restricted to post-1800 authors, midpoint at 1940.
dist_topicos %>% bind_cols(as_tibble(topicos_pca$scores[,1:2])) %>%
filter(!is.na(fecha),fecha>1800) %>%
ggplot(aes(Comp.1,Comp.2, label=autor, color = fecha)) +
geom_text()+
theme_minimal()+
scale_color_gradient2(low = 'darkorange3',mid ='darkgreen' , high = 'dodgerblue', midpoint = 1940)
# Rebuild the per-document (un-aggregated) topic distributions for the
# time-series view below.
dist_topicos <- df %>%
select(autor, fecha) %>%
bind_cols(as_tibble(as.matrix(posterior(lda_fit)$topics)))
# names(dist_topicos)[3:22] <- diccionario$nombre_topico
# Average topic share per decade, plotted as one line per topic.
plot <- dist_topicos %>%
  select(-autor) %>%
  mutate(fecha = round(fecha, -1)) %>%   # bucket years into decades
  group_by(fecha) %>%
  summarise_all(~mean(.x, na.rm = TRUE)) %>%
  filter(!is.na(fecha), fecha > 1800) %>%
  gather(topico, valor, 2:21) %>%        # 20 topic columns to long form
  mutate(topico = factor(topico)) %>%
  ggplot(aes(fecha, valor, group = topico, color = topico, fill = topico)) +
  geom_line() +
  scale_x_continuous(breaks = scales::pretty_breaks(10)) +
  # directlabels::geom_dl(aes(label = topico), method=list("top.qp", cex = .75))+
  theme_minimal() +
  # BUG FIX: the original chain ended at theme_minimal() (missing `+`),
  # so theme(legend.position = 'none') was evaluated standalone and its
  # theme object printed instead of removing the legend from the plot.
  theme(legend.position = 'none')
List of 1
$ legend.position: chr "none"
- attr(*, "class")= chr [1:2] "theme" "gg"
- attr(*, "complete")= logi FALSE
- attr(*, "validate")= logi TRUE
# Interactive version of the decade plot.
plotly::ggplotly(plot)
obs: El dataset tiene mucho de Allende, Tópico 6. Tal vez habría que subsamplear.
# Re-print the top terms per topic for reference while reading the network.
Terms
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 Topic 10
[1,] "tratamiento" "lenin" "mitin" "hombre" "gobierno" "gramsci" "escritos" "stalin" "anapo" "convocado"
[2,] "aplicables" "carta" "satelite" "ser" "pueblo" "cuadernos" "oposicion" "trotsky" "colombiana" "judiciales"
[3,] "link" "tomo" "descargar" "hombres" "chile" "ordine" "internacional" "moscu" "lleras" "meses"
[4,] "nestor" "libro" "formato" "sociedad" "pais" "nuovo" "trotsky" "gpu" "mrl" "und"
[5,] "paradojas" "editorial" "link" "vida" "trabajadores" "antonio" "partido" "comite" "caudillismo" "vino"
[6,] "significado" "revista" "parisino" "mujeres" "popular" "ever" "stalin" "oposicion" "anapistas" "absuelto"
[7,] "acosado" "cartas" "reconquista" "sino" "presidente" "aga" "izquierda" "central" "anapista" "aclamada"
[8,] "agradaria" "ulianova" "abandonar" "mujer" "companeros" "distorsion" "carta" "lenin" "valencia" "actualidad"
[9,] "aprension" "obras" "acepta" "social" "nacional" "pci" "sovietica" "anos" "bipartidismo" "afirmais"
[10,] "blind" "primera" "aplicara" "mundo" "ser" "carcel" "comintern" "juicio" "pinilla" "antiguedad"
Topic 11 Topic 12 Topic 13 Topic 14 Topic 15 Topic 16 Topic 17 Topic 18 Topic 19
[1,] "partido" "bolivia" "islamistas" "eisermann" "trabajo" "espana" "china" "lula" "indice"
[2,] "revolucion" "casa" "islam" "enviadas" "produccion" "espanola" "mao" "formulaba" "organo"
[3,] "lucha" "mariategui" "islamismo" "humor" "economia" "poum" "chino" "introducido" "pitt"
[4,] "politica" "carlos" "musulmanes" "pertenencias" "capitalista" "cataluna" "tse" "plato" "reuniones"
[5,] "clase" "hombres" "jomeini" "pottier" "industria" "barcelona" "tung" "abandonaran" "aplastando"
[6,] "guerra" "lora" "mezquitas" "reciba" "capital" "espanol" "albania" "agencias" "conservado"
[7,] "proletariado" "noche" "arabia" "rompiendo" "sistema" "madrid" "rda" "alejados" "dada"
[8,] "masas" "villa" "fundamentalismo" "www" "campesinos" "mfa" "chinos" "american" "dobles"
[9,] "poder" "oro" "fis" "abarca" "propiedad" "nin" "kuomintang" "aparentemente" "especializado"
[10,] "obreros" "nota" "islamica" "aisladas" "economica" "cnt" "tsetung" "apoyaremos" "gordos"
Topic 20
[1,] "ejercito"
[2,] "frente"
[3,] "rojo"
[4,] "militares"
[5,] "guerra"
[6,] "militar"
[7,] "sovietico"
[8,] "rusia"
[9,] "campesinos"
[10,] "pueblo"
# One averaged topic profile per author (same aggregation as before).
dist_topicos_autor <- dist_topicos %>%
  group_by(autor) %>%
  summarise_all(~mean(.x, na.rm = TRUE)) %>%
  mutate(fecha = round(fecha))
# Author-by-author cosine similarity over the topic columns.
# BUG FIX: the original indexed 3:21, silently dropping topic 20 — the
# 20 topic columns span 3:22 (cols 1-2 are autor and fecha), as used
# everywhere else in this script. Also `<-` instead of `=`.
adjMat <- cosine(t(as.matrix(dist_topicos_autor[, 3:22])))
colnames(adjMat) <- dist_topicos_autor$autor
rownames(adjMat) <- dist_topicos_autor$autor
adjMat[1:5, 1:5]
Albert Einstein Albert Mathiez Albert Rhys Williams Alberto Flores Galindo Alejandra Kollontai
Albert Einstein 1.0000000 0.5177958 0.3898993 0.3792780 0.9242230
Albert Mathiez 0.5177958 1.0000000 0.8931141 0.8766538 0.7893882
Albert Rhys Williams 0.3898993 0.8931141 1.0000000 0.9291002 0.6906881
Alberto Flores Galindo 0.3792780 0.8766538 0.9291002 1.0000000 0.6673232
Alejandra Kollontai 0.9242230 0.7893882 0.6906881 0.6673232 1.0000000
# Five-number summary of the similarities, used to pick the edge threshold.
fivenum(adjMat)
[1] 0.0390587 0.4391939 0.7288184 0.9135718 1.0000000
# Binarise the similarity matrix with a high cut-off (0.9) so the
# resulting graph does not get too dense.
# BUG FIX: the original pair of strict inequalities (>0.9 then <0.9)
# left values exactly equal to 0.9 untouched; a single vectorised
# ifelse covers the whole range with one unambiguous rule.
adjMat <- ifelse(adjMat > 0.9, 1, 0)
adjMat[1:5, 1:5]
Albert Einstein Albert Mathiez Albert Rhys Williams Alberto Flores Galindo Alejandra Kollontai
Albert Einstein 1 0 0 0 1
Albert Mathiez 0 1 0 0 0
Albert Rhys Williams 0 0 1 1 0
Alberto Flores Galindo 0 0 1 1 0
Alejandra Kollontai 1 0 0 0 1
# Undirected, unweighted author graph: an edge wherever the binarised
# cosine similarity is 1; no self-loops. (`<-` instead of `=`.)
g <- graph_from_adjacency_matrix(adjMat, weighted = NULL, mode = "undirected", diag = FALSE)
# Attach each author's rounded date as a vertex attribute for colouring.
V(g)$fecha <- dist_topicos_autor$fecha
mean(degree(g))
[1] 57.93365
# Quick base-igraph plot with an automatic layout and no labels,
# just to eyeball the overall structure.
l <- layout_nicely(g)
plot(g,edge.arrow.size=.2, vertex.size=4,vertex.frame.color="#ffffff",
vertex.label="", vertex.label.color="black",
layout=l)
# Author network restricted to post-1800 authors with degree > 1,
# coloured by date; overlapping labels are skipped.
# (check_overlap = TRUE spelled out — T is a reassignable binding.)
as_tbl_graph(g) %>%
  filter(fecha > 1800) %>%
  activate(nodes) %>%
  mutate(importance = centrality_degree()) %>%
  filter(importance > 1) %>%
  ggraph() +
  geom_edge_link(color = 'grey') +
  geom_node_point(aes(color = fecha)) +
  geom_node_text(aes(label = name), check_overlap = TRUE, nudge_y = -.5) +
  theme_void() +
  scale_color_gradient2(low = 'darkorange3', mid = 'darkgreen', high = 'dodgerblue', midpoint = 1900)
Using `nicely` as default layout
# Same network without the degree filter: all post-1800 authors,
# including isolated ones. (check_overlap = TRUE instead of T.)
as_tbl_graph(g) %>%
  filter(fecha > 1800) %>%
  activate(nodes) %>%
  ggraph() +
  geom_edge_link(color = 'grey') +
  geom_node_point(aes(color = fecha)) +
  geom_node_text(aes(label = name), check_overlap = TRUE, nudge_y = -.5) +
  theme_void() +
  scale_color_gradient2(low = 'darkorange3', mid = 'darkgreen', high = 'dodgerblue', midpoint = 1900)
Using `nicely` as default layout
# Same network stored in a variable for plotly. The extra `label`
# aesthetic on the points is not a ggplot2 aesthetic (hence the
# "Ignoring unknown aesthetics" warning) — it is presumably intended
# as the plotly hover tooltip; confirm if plotly support is kept.
# (check_overlap = TRUE instead of T.)
grafo_plot <- as_tbl_graph(g) %>%
  filter(fecha > 1800) %>%
  activate(nodes) %>%
  ggraph() +
  geom_edge_link(color = 'grey') +
  geom_node_point(aes(color = fecha, label = name)) +
  geom_node_text(aes(label = name), check_overlap = TRUE, nudge_y = -.5) +
  theme_void() +
  scale_color_gradient2(low = 'darkorange3', mid = 'darkgreen', high = 'dodgerblue', midpoint = 1900)
Using `nicely` as default layout
Ignoring unknown aesthetics: label
# NOTE: plotly cannot convert ggraph edge geoms (geom_edge_link), so
# this renders nodes only — see the printed plotly message below.
plotly::ggplotly(grafo_plot)
geom_GeomEdgePath() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues